#!/usr/bin/perl
#Name    	: 4_Secondary_validation.pl
#Author  	: Morgan, Matthew
#Created 	: 10/2010
#Modified	: 03/2012
#Purpose	: Secondary Validation within samples.  Evaluates each sequence as a potential error given difference to other sequences and read number.
#Syntax		: 4_Secondary_validation.pl <Preliminary Validated sequences> <First MID number(starts at 1)> <Number of MIDs> 
#Further info	: Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
#Acknowledgment	: Code for retrieving aligned sequences from MUSCLE output was borrowed and modifed from OCTUPUS (v0.1.1, available at http://octupus.sourceforge.net/)
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.

#########################################################################################################################################################	
#																			#
#CSIRO Open Source Software License Agreement (GPLv3)													#
#																			#
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.					#
#																			#
#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3				#
# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material.	#
#The following additional terms apply under clause 7 of that license:											#
#																			#
#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS		#
#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS,	#
#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,		#
#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.				#
#																			#
#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT		#
#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.		#
#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS,		#
#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS;		#
#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE		#
#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF		#
#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.														#
#																			#
#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS		#
#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR			#
#MODIFIED "CONSUMER GUARANTEES".  IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL		#
#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION.  WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF		#
#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM:							#
#1.          THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;					#
#2.          THE REPAIR OF THE SOFTWARE; 														#
#3.          THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN,		#
#	     OR HAVING THE SOFTWARE REPAIRED.														#
#																			#
#########################################################################################################################################################

use warnings;
use strict;

system 'rm validation*';

#Prompt for cutoff values for expected PCR error and chimera formation rates

print "Set PCR error cut-off (default value = 0.02) : ";
my $pcrrate = <STDIN>;
chomp($pcrrate);
if ( !$pcrrate ) {
    $pcrrate = 0.02;
}
print "\nPCR error cut-off set to $pcrrate\n\n";
print "Set chimera error cut-off (default = 0.15) : ";
my $chimrate = <STDIN>;
chomp($chimrate);
if ( !$chimrate ) {
    $chimrate = 0.15;
}
print "\nChimera error cut-off set to $chimrate\n\n";
my %valid;
my %invalid;
open (COMP, ">Indel_sequence_pairs.txt");

#Evalute all sequences in each MID (sample) independently, then bring combined validation results together

MID: for ( my $m = $ARGV[1] - 1 ; $m < $ARGV[1] + $ARGV[2] - 1 ; $m++ ) {
    my %totalreads = ();
    my %exclude    = ();
    my %seqs       = ();
    my %counts     = ();
    my $sum        = 0;
    my $min_reads  = 1;
    open( SEQS, "<$ARGV[0]" );
    SEQ: while (<SEQS>) {
	if (/^name/) {
		next SEQ;
	}        
	chomp;
        s/\r//g;
        my @first = split( /\s+/, $_ );
        $totalreads{ $first[0] } = $first[5];
        my $r             = $m + 6;
        my $numberofreads = $first[$r];
        $sum += $numberofreads;
    }
    close(SEQS);
    if ( $sum == 0 ) {
        next MID;
    }
    open( SEQS, "<$ARGV[0]" );
    my $lines;
    while (<SEQS>) {
        unless (/^name/) {
            $lines++;
            chomp;
            my @tmp    = split( /\t/, $_ );
            my $name   = $tmp[0];
            my $seq    = $tmp[4];
            my $r      = $m + 6;
            my $nreads = $tmp[$r];
            unless ( $nreads <= $min_reads ) {
                $seqs{$name}   = $seq;
                $counts{$name} = $nreads;
            }
        }
    }
    close(SEQS);
    $seqs{'dummy'}   = 'TACG';
    $counts{'dummy'} = 1;
    print "Read $lines lines\n";

#sort sequences by abundance in the sample

    my @abun = sort { $counts{$b} <=> $counts{$a} } keys %counts;
    my $n    = scalar(@abun);
    my $took = time - $^T;
    my $ex   = scalar( keys %exclude );
    print "Running for $took seconds\n";
    my $v  = scalar( keys %valid );
    my $in = scalar( keys %invalid );
    print "Valid:\t$v\n";
    print "Invalid:\t$in\n";
    my $midno = $m+1;
    print "\nMID:\t$midno\n";
    my $rels = $n-1;
    print "There are $rels sequences to evaluate\n";
    my $n1    = 0;
    my $n2    = 0;
    my $cc    = 0;
    my $count = 0;
    my %seen = ();

#Perform all unique three-way alignments with two most abundant sequences are potential 'parent' sequences and less abundant seuqence as potential 'daughter'

    for ( my $i = 0 ; $i < $n ; $i++ ) {
        my $parent1 = $abun[$i];
        for ( my $j = $i + 1 ; $j < $n ; $j++ ) {
            my $parent2 = $abun[$j];
	    for ( my $k = $j + 1 ; $k < $n ; $k++ ) {
                my $cand = $abun[$k];
                open( TEMP, ">temp.fna" );
                print TEMP ">P1\n$seqs{$parent1}\n";
                print TEMP ">P2\n$seqs{$parent2}\n";
                print TEMP ">CC\n$seqs{$cand}\n";
                close(TEMP);
                $count++;
                print "$count\r";

#run MUSCLE alignment using parameters optimised for speed and accuracy

                system
                  "muscle -in temp.fna -out temp.aln -maxiters 1 -diags -quiet";
		
		#Code for retrieving aligned sequences borrowed and modified from OCTUPUS v0.1.1 (http://octupus.sourceforge.net/)
		#Start
                local $/ = ">";
                open( ALIGN, "<temp.aln" ) || die "Can't find alignment";
                my %hash  = ();
                my %names = ();
                my $len;
                while (<ALIGN>) {
                    chomp;
                    my ( $nm, $sq ) = split( /\n/, $_, 2 );
                    next unless ( $nm && $sq );
                    $nm =~ s/^>//g;
                    $nm =~ s/[\n\r]//g;
                    $sq =~ s/[\n\r]//g;
                    $hash{$nm} = $sq;
                    $len = length($sq);
                }
		#End
                my $current = 0;
                my $changes = 0;
                my $new;
                my $indels = 0;
                my $subs   = 0;
                my $nID;
                my @pos     = ();
                my @confirm = ();
                my $reject  = 0;

#Parse alignments: track the type (substitution or gap) and number of differences between all three sequences

                for ( my $x = 0 ; $x < $len ; $x++ ) {
                    if (
                        (
                            substr( $hash{'P1'}, $x, 1 ) eq
                            substr( $hash{'P2'}, $x, 1 )
                        )
                        && (
                            substr( $hash{'P1'}, $x, 1 ) ne
                            substr( $hash{'CC'}, $x, 1 ) )
                      )
                    {

                        $reject = 1;
                        $nID++;

                    }
                    elsif (
                        substr( $hash{'P1'}, $x, 1 ) eq
                        substr( $hash{'P2'}, $x, 1 ) )
                    {

                        $nID++;

                    }
                    elsif (
                        substr( $hash{'P1'}, $x, 1 ) eq
                        substr( $hash{'CC'}, $x, 1 ) )
                    {

                        $new = '1';
                        if ( $new == $current ) {
                            push @confirm, $x + 1;

                        }
                        else {
                            $current = $new;
                            $changes++;
                            push @pos, $x + 1;
                        }

                        if (   ( substr( $hash{'P1'}, $x, 1 ) eq "-" )
                            || ( substr( $hash{'P2'}, $x, 1 ) eq "-" ) )
                        {
                            $indels++;
                        }
                        else {
                            $subs++;
                        }
                    }
                    elsif (
                        substr( $hash{'P2'}, $x, 1 ) eq
                        substr( $hash{'CC'}, $x, 1 ) )
                    {

                        $new = '2';
                        if ( $new == $current ) {
                            push @confirm, $x + 1;

                        }
                        else {
                            $current = $new;
                            $changes++;
                            push @pos, $x + 1;
                        }
                        if (   ( substr( $hash{'P1'}, $x, 1 ) eq "-" )
                            || ( substr( $hash{'P2'}, $x, 1 ) eq "-" ) )
                        {
                            $indels++;
                        }
                        else {
                            $subs++;
                        }
                    }
                    else {
                        $reject = 1;

                        if (   ( substr( $hash{'P1'}, $x, 1 ) eq "-" )
                            || ( substr( $hash{'P2'}, $x, 1 ) eq "-" ) )
                        {
                            $indels++;
                        }
                        else {
                            $subs++;
                        }
                    }
                }

#Parse sequence difference data to assess whether any sequence could be derived from another based on the type of differences, the number of differences, and the relative number of reads for the putative real and error sequences.

                if ( ( $indels > 0 ) && ( $subs == 0 ) ) {
                    if ( $counts{$parent1} > 10 ) {
			unless ($counts{$parent1}==$counts{$parent2}) {    #should add this so that equal numbers >10 both get validated.                    
				$exclude{$parent2}++ ;
				my $comb = $parent2 . "_" . $parent1;
				if ( !defined ($seen{$comb})) { 				
					print COMP "$m\t$parent2\tIn\t$parent1\n"; #use this file to find complementary indels in script5 validation			
					$seen{$comb} = 1;				
				}
			}
                    }
                    elsif ( $totalreads{$parent1} > $totalreads{$parent2} ) {
                        $exclude{$parent2}++;
			my $comb = $parent2 . "_" . $parent1;			
			if ( !defined ($seen{$comb})) { 				
				print COMP "$m\t$parent2\tIn\t$parent1\n"; #use this file to find complementary indels in script5 validation			
				$seen{$comb} = 1;
			}			
                    }
                    else {
                        $exclude{$parent1}++;
			my $comb = $parent1 . "_" . $parent2;                    	
			if ( !exists ($seen{$comb})) { 				
				print COMP "$m\t$parent1\tIn\t$parent2\n"; #use this file to find complementary indels in script5 validation			
				$seen{$comb} = 1;
			}			
		    }
                }
                else {
                    my $cut = ( $pcrrate**$subs );
                    if ( $counts{$parent2} < ( $counts{$parent1} * $cut ) ) {
                        $exclude{$parent2}++;
                    }
                }
                unless ( $reject == 1 ) {
                    if ( $changes >= 2 ) {
                        my $chexp = $changes - 1;
                        my $max   = $chimrate**$chexp;
                        if ( $counts{$cand} <= ( $counts{$parent2} * $max ) ) {
                            $exclude{$cand}++;
                        }
                    }
                }
            }
        }
    }
    
#Print output

open( MID, ">>validation.mid$m" );
    foreach my $s ( sort keys %seqs ) {
        if ( exists( $exclude{$s} ) ) {
            $invalid{$s}++;
            print MID "$s\tINVALID\n";
        }
        else {
            $valid{$s}++;
            print MID "$s\tVALID\n";
        }
    }
    close(MID);
}
close (COMP);
print "Valids\n\n";
for my $z ( sort keys %valid ) {
    print "$z\t$valid{$z}\n";
}

print "Invalids\n\n";
for my $y ( sort keys %invalid ) {
    print "$y\t$invalid{$y}\n";
}

#Move all output files to same location for use with final (fifth) script

system 'mkdir Validation_by_sample';
system 'mv validation.mid* Validation_by_sample';

my $took = time - $^T;
print "Took $took seconds to complete\n";
print "DONE\n";
